package com.sima.crawler;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

/**
 * Created by maple on 2017-04-28.
 */
public class GankRepoPageProcessor implements PageProcessor {
    //抓取网站的相关配置，包括编码、抓取间隔、重试次数等
    private Site site = Site.me().setRetryTimes(3).setSleepTime(2000);

    // process是定制爬虫逻辑的核心接口，在这里编写抽取逻辑
    public void process(Page page) {
        //定义如何抽取页面信息
        //爬取干货集中营历史数据，http://gank.io/2017/04/26
        page.addTargetRequests(page.getHtml().links().regex("(http://gank\\.io/\\d+/\\d+/\\d+)").all());
        page.putField("title", page.getHtml().$("h1").toString());//获取标题
        page.putField("content", page.getHtml().$("div.outlink").toString());//获取页面内容
        if (page.getResultItems().get("title") == null) {
            //跳过没有数据的页面
            page.setSkip(true);
        }
    }

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new GankRepoPageProcessor())
                .addUrl("http://gank.io")//从该url开始
                .addPipeline(new GankDaoPipeline())
                .thread(5)
                .run();
    }
}
